The aim of this notebook is to analyse and understand the impact COVID-19 has had globally, whilst learning and consolidating data visualisation techniques in Python.
Import Relevant Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
import math
import os
from datetime import datetime, timedelta
import plotly.offline as pyo
# Initialise plotly's notebook mode so that figures work offline.
pyo.init_notebook_mode()
Date Formatting
def dateparse(x):
    """Parse a ``YYYY-MM-DD`` date string into a ``datetime``.

    Raises ``ValueError`` when *x* does not match that format.
    """
    return datetime.strptime(x, '%Y-%m-%d')
Read the data (csv files) using pandas
Understand what columns are available and the different data types
# Load the daily figures. The 'date' column is converted after loading with an
# explicit format, because read_csv's `date_parser` argument is deprecated in
# pandas 2.x.
df_daily = pd.read_csv('COVID_Daily.csv')
df_daily['date'] = pd.to_datetime(df_daily['date'], format='%Y-%m-%d')
df_daily.head()
df_daily.columns
df_daily.info()
# df_daily.describe
# Load the summary table (one 'country' / 'continent' pairing is read from it later).
summary = pd.read_csv('COVID_Summary.csv')
summary.head()
summary.columns
summary.info()
# summary.describe
Add continent column from summary data to daily data to enable more visualisations and optimise data usability
# Build a country -> continent lookup once (first match wins, as before) and
# map it onto the daily rows. This avoids re-filtering `summary` for every row,
# and a country that is absent from `summary` gets NaN instead of raising.
continent_by_country = summary.drop_duplicates('country').set_index('country')['continent']
df_daily['continent'] = df_daily['country'].map(continent_by_country)
df_daily.head()
Create function to add commas to the numbers
def count(num):
    """Return *num* with commas inserted between groups of three digits.

    *num* is the textual form of a number (anything else is passed through
    ``str`` first). A leading sign and a decimal part are kept as they are;
    only the integer digits are grouped, so ``"-100"`` stays ``"-100"`` and
    ``"1234.0"`` becomes ``"1,234.0"``.
    """
    text = str(num)

    # Peel off a leading sign so it is never counted as a digit.
    sign = ""
    if text[:1] in ("-", "+"):
        sign, text = text[0], text[1:]

    # Only the part before the decimal point is grouped.
    integer, dot, fraction = text.partition(".")

    groups = []
    while len(integer) > 3:
        groups.append(integer[-3:])
        integer = integer[:-3]
    groups.append(integer)

    return sign + ",".join(reversed(groups)) + dot + fraction
Scale Values
# Colour-bar tick positions (even exponents 0..24) and, for each one, the
# matching power of two written with thousands separators.
value = list(range(0, 25, 2))
powers_of_two = np.exp2(value).astype(int)
log_scale = [count(label) for label in powers_of_two.astype(str)]
# Keep complete rows with a positive active-case count, sorted by date before
# the date is turned into a text label for the animation frames.
# `.copy()` makes the filtered frame independent, so the column assignments
# below do not write into a slice (avoids pandas' SettingWithCopyWarning).
active_df = df_daily[['date', 'country', 'active_cases']].dropna().sort_values('date')
active_df = active_df[active_df.active_cases > 0].copy()
# Colour on a log2 scale so the power-of-two colour-bar ticks line up.
active_df['log2(active_cases)'] = np.log2(active_df['active_cases'])
active_df['date'] = active_df['date'].dt.strftime('%m/%d/%Y')
fig = px.choropleth(active_df, locations="country", locationmode='country names',
                    color="log2(active_cases)", hover_name="country", hover_data=['active_cases'],
                    projection="natural earth", animation_frame="date",
                    title='<b>COVID-19 Global Active Cases Over Time</b>',
                    color_continuous_scale="inferno_r",  # invert colour scale
                    )
# Values on colour bar
fig.update_layout(coloraxis={"colorbar": {"title": "Active Cases",
                                          "tickvals": value,
                                          "ticktext": log_scale}})
# Shorten the frame and transition durations used by the first animation button.
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 1
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 1
fig.show()
Percentages can hide the true values; as data scientists we need to understand the full implications. More details are given in the presentation.
# Share of active, recovered and fatal cases across all countries in `summary`.
# The colours are matched to the labels by position. "paleblue" is not a CSS
# colour name, so "lightblue" is used for the recovered slice instead.
pie1 = go.Pie(labels=['Total Active', 'Total Recovered', 'Total Deaths'],
              values=[summary.active_cases.sum(), summary.total_recovered.sum(), summary.total_deaths.sum()],
              title="<b> Total Recovered, Active Coronavirus Cases and Deaths</b> ",
              marker=dict(colors=["lightgreen", "lightblue", "red"]),
              )
vis_1 = go.Figure(data=[pie1])
vis_1.show()
# Actual total number of Deaths
print(summary.total_deaths.sum())
# Pick the ten countries with the most active cases on the latest date in the
# data, then chart the full active-case history of those countries.
latest_date = df_daily.date.max()
latest_rows = df_daily[df_daily.date == latest_date]
top_countries = latest_rows.sort_values("active_cases", ascending=False).iloc[:10].country
top_history = df_daily[df_daily.country.isin(top_countries)]
top_history = top_history.sort_values("active_cases", ascending=False)

fig = px.area(top_history, x="date", y="active_cases",
              color="country", template="plotly_white")
fig.update_traces(line={"width": 2})
fig.update_layout(
    title=f"Top 10 Countries with Most Active Cases on {df_daily.date.max().strftime('%Y-%m-%d')}",
    xaxis={"title": "Date"},
    yaxis={"title": "Active Cases"},
)
# Treemap with one tile per country, sized by that country's total deaths.
vis_2 = px.treemap(
    summary,
    path=["country"],
    values="total_deaths",
    title="Total Coronavirus Deaths by Country",
    height=500,
    color_discrete_sequence=px.colors.qualitative.Set1,
)
vis_2.show()
# Treemap with one tile per country, sized by that country's total confirmed cases.
vis_3 = px.treemap(summary, path=["country"], values="total_confirmed", height=500,
                   title="Total Confirmed Coronavirus Cases by Country",
                   color_discrete_sequence=px.colors.qualitative.Set1)
vis_3.show()
# Colour by log2 of the death count so the power-of-two colour-bar ticks line
# up. Non-positive counts are masked to NaN first, because log2(0) is -inf and
# emits a divide-by-zero RuntimeWarning.
total_deaths_col = summary['total_deaths']
summary['log(Total Deaths)'] = np.log2(total_deaths_col.where(total_deaths_col > 0))
# Thousands-separated hover text. Going through int() keeps the label free of a
# trailing ".0" when the column is float-typed (e.g. because it holds NaN);
# missing values keep the "nan" label they had before.
summary['Total Deaths'] = total_deaths_col.apply(
    lambda x: f"{int(x):,}" if pd.notna(x) else "nan")
vis_4 = px.choropleth(summary,
                      locations="country",
                      color="log(Total Deaths)",
                      locationmode='country names',
                      hover_name='country',
                      hover_data=['Total Deaths'],
                      color_continuous_scale='rdylgn_r',
                      title='<b>Coronavirus Deaths Around The Globe</b>')
vis_4.update_layout(title_font_size=15,
                    margin={"r": 20, "l": 30},
                    coloraxis={"colorbar": dict(title="<b>Total Deaths</b><br>",
                                                tickvals=value,
                                                ticktext=log_scale)})
vis_4.show()
# Colour by log2 of the confirmed-case count so the power-of-two colour-bar
# ticks line up. Non-positive counts are masked to NaN first, because log2(0)
# is -inf and emits a divide-by-zero RuntimeWarning.
total_confirmed_col = summary['total_confirmed']
summary['log(Total Confirmed)'] = np.log2(total_confirmed_col.where(total_confirmed_col > 0))
# Thousands-separated hover text. Going through int() keeps the label free of a
# trailing ".0" when the column is float-typed (e.g. because it holds NaN);
# missing values keep the "nan" label they had before.
summary['Total Confirmed'] = total_confirmed_col.apply(
    lambda x: f"{int(x):,}" if pd.notna(x) else "nan")
vis_5 = px.choropleth(summary,
                      locations="country",
                      color="log(Total Confirmed)",
                      locationmode='country names',
                      hover_name='country',
                      hover_data=['Total Confirmed'],
                      color_continuous_scale='rdylgn_r',
                      title='<b>Coronavirus Confirmed Cases Around The Globe</b>')
vis_5.update_layout(title_font_size=15,
                    margin={"r": 20, "l": 30},
                    coloraxis={"colorbar": dict(title="<b> Total Confirmed</b><br>",
                                                tickvals=value,
                                                ticktext=log_scale)})
vis_5.show()
# Inspect the distinct values present in the summary table's 'continent' column.
summary.continent.unique()
def deathsbycontinent(continent):
    """Plot cumulative COVID-19 deaths per country for one continent.

    Uses the rows of ``df_daily`` whose ``continent`` equals *continent*,
    after dropping rows that contain any missing value. One line is drawn per
    country; each line also gets an end-point marker and a text label whose
    sizes grow with the country's final cumulative death count. The figure is
    shown and nothing is returned.

    Raises:
        ValueError: if no complete rows exist for *continent*.
    """
    death_continent = df_daily[df_daily.continent == continent]
    death_continentdf = death_continent.dropna()
    if death_continentdf.empty:
        # Fail with a clear message instead of the opaque "max() arg is an
        # empty sequence" raised further down when there are no traces.
        raise ValueError(f"no complete daily rows for continent {continent!r}")

    vis_6 = px.line(death_continentdf, x="date", y="cumulative_total_deaths", color="country",  # log_y=True,
                    line_group="country", hover_name="country", template="seaborn")

    # Snapshot the line traces once: marker traces are added to the figure
    # inside the loop below and must not be visited themselves.
    line_traces = list(vis_6.select_traces())

    # Scale marker/label sizes relative to the largest final total. A continent
    # whose countries all report zero deaths would otherwise divide by zero.
    largest_total = max(trace.y[-1] for trace in line_traces)
    y_scale = 0.155 / largest_total if largest_total > 0 else 0

    annotations = []
    for label in line_traces:
        last_x, last_y = label.x[-1], label.y[-1]
        # math.log is undefined for 0, so a country without deaths gets the
        # minimum size rather than raising a math domain error.
        if last_y > 0:
            size = max(1, int(math.log(last_y, 1.1) * last_y * y_scale))
        else:
            size = 1
        # labeling the right_side of the plot (pushed further right for bigger markers)
        annotations.append(dict(x=last_x + timedelta(hours=int((2 + size/5) * 24)), y=last_y,
                                xanchor='left', yanchor='middle',
                                text=label.name,
                                font=dict(family='Arial',
                                          size=7+int(size/2)),
                                showarrow=False))
        vis_6.add_trace(go.Scatter(
            x=[last_x],
            y=[last_y],
            mode='markers',
            name=label.name,
            marker=dict(color=label.line.color, size=size)
        ))

    vis_6.update_traces(line={'width': 2})
    vis_6.update_layout(annotations=annotations, showlegend=True, uniformtext_mode='hide',
                        title=f"<b>Cumulative Total Coronavirus Deaths in {continent}<br>between {death_continentdf.date.min().strftime('%Y-%m-%d')} and {death_continentdf.date.max().strftime('%Y-%m-%d')}</b>",
                        xaxis={'title': 'Date'},
                        yaxis={'title': 'Coronavirus Confirmed Deaths'}
                        )
    vis_6.show()
# Plot the chart for Europe; pass another continent name to analyse a different region.
deathsbycontinent("Europe")
# Treemap with one tile per country, sized by active cases; the title carries
# the latest date found in the daily data.
latest_day_text = df_daily.date.max().strftime('%Y-%m-%d')
fig = px.treemap(
    summary,
    path=["country"],
    values="active_cases",
    title=f"<b>Active Cases Breakdown on {latest_day_text}</b>",
    height=750,
    color_discrete_sequence=px.colors.qualitative.Set1,
)
# Show the label, text and value on each tile.
fig.update_traces(textinfo="label+text+value")
fig.show()
# summary.country.unique()
def _show_daily_chart(daily, *, bar_name, average_name, average_color, yaxis_title, title):
    """Show a date-indexed daily series as bars with its 7-day moving average."""
    moving_average = daily.rolling(7).mean().dropna().astype(int)
    fig = go.Figure()
    fig.add_trace(go.Bar(name=bar_name, x=daily.index, y=daily, marker_color='black'))
    # Plot the average against its own index so every point stays on its date,
    # even when missing values in the middle of the series were dropped.
    fig.add_trace(go.Scatter(name=average_name, x=moving_average.index, y=moving_average,
                             line={'width': 3, 'color': average_color}))
    fig.update_layout(go.Layout(
        yaxis={'range': [0, daily.max() * 1.05], 'title': yaxis_title},
        xaxis={'title': ''},
        template='plotly',
        title=title,
    ))
    fig.show()


def countrystat(country):
    """Show the case and death charts for one country from ``df_daily``.

    Up to three figures are shown, each skipped when its column holds no data:
    cumulative confirmed cases, daily new cases with a 7-day moving average,
    and daily new deaths with a 7-day moving average. Nothing is shown when
    ``df_daily`` has no rows for *country*. Returns ``None``.
    """
    prefix = "The " if country in ("UK", "USA") else ""

    # set_index returns a new frame, so the filtered slice of df_daily is not
    # modified in place.
    c = df_daily[df_daily.country == country].set_index('date')
    if c.empty:
        return

    first_day = c.index[0].strftime('%Y-%m-%d')
    last_day = c.index[-1].strftime('%Y-%m-%d')

    # 1. Cumulative total cases
    if not c.cumulative_total_cases.isna().all():
        # .iloc[-1] reads the last row by position; plain [-1] on a
        # date-indexed Series is deprecated positional access.
        layout = go.Layout(
            yaxis={'range': [0, c.cumulative_total_cases.iloc[-1] * 1.05],
                   'title': 'Coronavirus Confirmed Cases'},
            xaxis={'title': ''},
        )
        fig = px.area(c, x=c.index, y="cumulative_total_cases",
                      title=f"Cumulative Total Confirmed Cases in {prefix}{country} from {first_day} till {last_day}",
                      template='plotly')
        fig.update_traces(line={'width': 5})
        fig.update_layout(layout)
        fig.show()

    # 2. Daily new cases with 7-day moving average
    if not c.daily_new_cases.isna().all():
        _show_daily_chart(
            c.daily_new_cases,
            bar_name="Daily Cases",
            average_name="Moving Average (7 Daily)",
            average_color='green',
            yaxis_title='Daily New Coronavirus Confirmed Cases',
            title=f" Daily New Cases in {prefix}{country} from {first_day} till {last_day} and showing a 7 daily moving average",
        )

    # 3. Daily new Deaths with 7-day moving average
    if not c.daily_new_deaths.isna().all():
        _show_daily_chart(
            c.daily_new_deaths,
            bar_name="Daily Deaths",
            average_name="7-Day Moving Average",
            average_color='red',
            yaxis_title='Daily New Coronavirus Deaths',
            title=f"Daily Deaths in {prefix}{country} from {first_day} till {last_day}",
        )
# Show the charts for one country; pass a different country name to analyse another.
countrystat('Italy') # change country name here
# countrystat('Australia') # change country name here
# Order countries by deaths per million and drop those without a figure.
# `.copy()` makes the filtered frame independent, so the new columns below are
# not written into a slice of `summary` (avoids SettingWithCopyWarning).
sort = summary.sort_values(['total_deaths_per_1m_population'])
sort = sort[sort['total_deaths_per_1m_population'].notna()].copy()
# Deaths per one million people divided by one million is the fraction of the
# population that died.
sort['% of Population with Coronavirus Death Cases'] = sort['total_deaths_per_1m_population'] / 1_000_000
mean = sort['% of Population with Coronavirus Death Cases'].mean()
# Countries above the mean rate are flagged "Red", all others "Blue".
sort['color'] = np.where(sort['% of Population with Coronavirus Death Cases'] > mean, "Red", "Blue")
# Bubble chart: one bubble per country, with both height and size given by the
# share of the population that died, coloured by the precomputed 'color' column.
latest_day_label = df_daily.date.max().strftime('%Y-%m-%d')
rate_ceiling = sort['% of Population with Coronavirus Death Cases'].max() * 1.1

fig = px.scatter(
    sort,
    x='country',
    y='% of Population with Coronavirus Death Cases',
    size='% of Population with Coronavirus Death Cases',
    color='color',
    height=650,
    title=f"<b>Coronavirus Death-Rate by Country as of {latest_day_label}</b>",
)
fig.update_traces(
    opacity=0.8,
    marker_line_width=1.5,
    marker_line_color='rgb(75,75,75)',
    hovertemplate="<b>%{x}</b><br>%{y} of Population with Death Cases<extra></extra>",
)
fig.update_layout(
    title_font_size=20,
    showlegend=False,
    xaxis={"title": ""},
    yaxis={"tickformat": ".3%", "range": [0, rate_ceiling]},
)
# Countries to call out on the scatter plot with a labelled arrow.
callout = ["China", "Australia", "India", "South Africa", "Russia", "Italy", "Brazil", "UK", "France", "USA", "Bulgaria", "Peru"]

# Arrow offsets that replace the defaults (ax=20, ay alternating -30/30 by
# position in `callout`) for individual countries.
offset_overrides = {
    "Russia": {"ax": -20},
    "Czech Republic": {"ay": -30, "ax": -60},
    "USA": {"ay": 50},
    "Italy": {"ay": 30, "ax": -20},
    "UK": {"ay": -30, "ax": 40},
    "Australia": {"ay": -30},
    "France": {"ay": -60, "ax": -40},
    "Brazil": {"ax": -20},
    "Peru": {"ay": -30},
}

# One row per country (first occurrence wins), keyed by country name, so each
# call-out is a direct lookup.
callout_rows = sort.drop_duplicates('country').set_index('country')

for i, country in enumerate(callout):
    # A call-out country that is missing from the data is skipped rather than
    # aborting the whole figure with an IndexError.
    if country not in callout_rows.index:
        continue

    offsets = {"ax": 20, "ay": 30 if i % 2 else -30}
    offsets.update(offset_overrides.get(country, {}))

    fig.add_annotation(
        x=country,
        y=callout_rows.at[country, '% of Population with Coronavirus Death Cases'],
        xref="x",
        yref="y",
        text=country,
        showarrow=True,
        font=dict(
            family="Courier New, monospace",
            size=14,
            color="#ffffff"
        ),
        align="center",
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="#636363",
        ax=offsets["ax"],
        ay=offsets["ay"],
        bordercolor="#c7c7c7",
        borderwidth=2,
        borderpad=4,
        # Label background matches the country's above/below-mean colour.
        bgcolor=callout_rows.at[country, 'color'],
        opacity=0.6
    )
# Draw a horizontal line at the mean rate from the first to the last country on
# the x axis, label it at the left end, then show the figure.
first_country = sort['country'].iloc[0]
last_country = sort['country'].iloc[-1]

fig.add_shape(
    type='line',
    xref='x', yref='y',
    x0=first_country, y0=mean,
    x1=last_country, y1=mean,
    line={'color': 'black', 'width': 1},
)
fig.add_annotation(
    x=first_country,
    y=mean,
    text=f"mean = {mean*100:.2f}%",
    showarrow=False,
    xanchor="left",
    yanchor="bottom",
    font={"color": "black", "size": 14},
)
fig.show()